Aim: The goal of this project is to:

Methodology:

The project would lay more emphasis on the explanatory techniques. It will be used in making data presentation to the viewers in a more succinct way. I therefore plan to use the R programing language to explore and analysis the dataset.

The dataset to be used is the World Health Nutrition and Population Statistics from year 2010 to 2016 . This can be obtained from http://databank.worldbank.org/data/reports.aspx?source=health-nutrition-and-population-statistics#advancedDownloadOptions .

Load this libraries and dataset and lets get to work!

suppressMessages(library(knitr))
suppressMessages(library(plyr))
suppressMessages(library(tidyr))
suppressMessages(library(dplyr))
suppressMessages(library(ggplot2))
suppressMessages(library(plotly))
suppressMessages(library(sqldf))
suppressMessages(library(MASS))
suppressMessages(library(reshape2))
suppressMessages(library(Amelia))
suppressMessages(library(mice))
suppressPackageStartupMessages(library(googleVis))
## Creating a generic function for 'toJSON' from package 'jsonlite' in package 'googleVis'
suppressMessages(library(ggdendro))

df <- read.csv("WorldHealth.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
kable(head(df[1:10, ]))
Year_Code Country_Name Country_Code Adults_15_living_HIV Adults_Children_0_14_15_living_HIV AIDS_estimated_deaths_UNAIDS Adults_children_0_14_15_newly_infected_HIV Adults_15_newly_infected_HIV Children_0_14_living_with_HIV Children_orphaned_by_HIV_AIDS Children_0_14_newly_infected_HIV Incidence_tuberculosis_per_100000 Labor_force_total Mortality_traffic_injury_100K Population_female Population_male Population_total Malaria_cases_reported Suicide_mortality_per_100K Tuberculosis_death_per_100K Tuberculosis_case_detection Tuberculosis_treatment_success_NewCases
YR2010 Afghanistan AFG 4300 4500 500 1000 1000 200 3000 100 189.0 7707349 15.3 14005473 14797694 28803167 69397 5.1 41.00 53 86
YR2010 Albania ALB 1000 1000 100 200 200 NA NA NA 17.0 1216574 15.4 1451422 1461599 2913021 NA 5.3 0.20 88 89
YR2010 Algeria DZA 9000 9200 500 1000 1000 500 2100 100 77.0 11115092 24.7 17887239 18230398 36117637 1 3.4 8.50 80 89
YR2010 American Samoa ASM NA NA NA NA NA NA NA NA 8.3 NA NA NA NA 55637 NA NA 0.68 87 100
YR2010 Andorra AND NA NA NA NA NA NA NA NA 9.5 NA NA NA NA 84449 NA NA 0.78 87 86
YR2010 Angola AGO 190000 210000 9400 25000 21000 18000 94000 3800 384.0 8394741 20.8 11936016 11433115 23369131 1682870 20.6 58.00 55 48
lat_long <- read.csv("Countries_long_lat2.csv", header = TRUE, sep = ",")
colnames(lat_long) <- c("Country_Name", "Country_Code", "Latitude", "Longtitude")
kable(head(lat_long))
Country_Name Country_Code Latitude Longtitude
Albania ALB 41.0000 20.0000
Algeria DZA 28.0000 3.0000
American Samoa ASM -14.3333 -170.0000
Andorra AND 42.5000 1.6000
Angola AGO -12.5000 18.5000
Anguilla AIA 18.2500 -63.1667
options(warn = -1)
df2 <- merge(df, lat_long, by.x = "Country_Name", by.y = "Country_Name", all = TRUE)
head(df2)
na_count_before <-sapply(df, function(a) sum(is.na(a)))
na_count_after <-sapply(df2, function(b) sum(is.na(b)))

kable(na_count_before)
Year_Code 0
Country_Name 0
Country_Code 0
Adults_15_living_HIV 511
Adults_Children_0_14_15_living_HIV 516
AIDS_estimated_deaths_UNAIDS 540
Adults_children_0_14_15_newly_infected_HIV 576
Adults_15_newly_infected_HIV 571
Children_0_14_living_with_HIV 684
Children_orphaned_by_HIV_AIDS 684
Children_0_14_newly_infected_HIV 678
Incidence_tuberculosis_per_100000 55
Labor_force_total 160
Mortality_traffic_injury_100K 916
Population_female 118
Population_male 118
Population_total 4
Malaria_cases_reported 703
Suicide_mortality_per_100K 916
Tuberculosis_death_per_100K 55
Tuberculosis_case_detection 101
Tuberculosis_treatment_success_NewCases 345
kable(na_count_after)
Country_Name 0
Year_Code 68
Country_Code.x 68
Adults_15_living_HIV 579
Adults_Children_0_14_15_living_HIV 584
AIDS_estimated_deaths_UNAIDS 608
Adults_children_0_14_15_newly_infected_HIV 644
Adults_15_newly_infected_HIV 639
Children_0_14_living_with_HIV 752
Children_orphaned_by_HIV_AIDS 752
Children_0_14_newly_infected_HIV 746
Incidence_tuberculosis_per_100000 123
Labor_force_total 228
Mortality_traffic_injury_100K 984
Population_female 186
Population_male 186
Population_total 72
Malaria_cases_reported 771
Suicide_mortality_per_100K 984
Tuberculosis_death_per_100K 123
Tuberculosis_case_detection 169
Tuberculosis_treatment_success_NewCases 413
Country_Code.y 246
Latitude 246
Longtitude 246
#Covet the columns to Numeric

df2[, c(4:22)] <- sapply(df2[, c(4:22)], as.numeric)

glimpse(df2)
## Observations: 1,364
## Variables: 25
## $ Country_Name                               <chr> "Afghanistan", "Afg...
## $ Year_Code                                  <chr> "YR2015", "YR2013",...
## $ Country_Code.x                             <chr> "AFG", "AFG", "AFG"...
## $ Adults_15_living_HIV                       <dbl> 6700, 5600, 4700, 5...
## $ Adults_Children_0_14_15_living_HIV         <dbl> 6900, 5800, 4900, 5...
## $ AIDS_estimated_deaths_UNAIDS               <dbl> 500, 500, 500, 500,...
## $ Adults_children_0_14_15_newly_infected_HIV <dbl> 1000, 1000, 1000, 1...
## $ Adults_15_newly_infected_HIV               <dbl> 1000, 1000, 1000, 1...
## $ Children_0_14_living_with_HIV              <dbl> 500, 200, 200, 200,...
## $ Children_orphaned_by_HIV_AIDS              <dbl> 4200, 3700, 3200, 3...
## $ Children_0_14_newly_infected_HIV           <dbl> 100, 100, 100, 100,...
## $ Incidence_tuberculosis_per_100000          <dbl> 189.0, 189.0, 189.0...
## $ Labor_force_total                          <dbl> 9825616, 8916157, 8...
## $ Mortality_traffic_injury_100K              <dbl> 15.2, NA, NA, NA, N...
## $ Population_female                          <dbl> 16346869, 15398276,...
## $ Population_male                            <dbl> 17389625, 16333412,...
## $ Population_total                           <dbl> 33736494, 31731688,...
## $ Malaria_cases_reported                     <dbl> 86895, 39263, 77549...
## $ Suicide_mortality_per_100K                 <dbl> 5.5, NA, NA, NA, NA...
## $ Tuberculosis_death_per_100K                <dbl> 37.00, 41.00, 42.00...
## $ Tuberculosis_case_detection                <dbl> 58, 53, 51, 51, 53,...
## $ Tuberculosis_treatment_success_NewCases    <dbl> NA, 88, 88, 88, 87,...
## $ Country_Code.y                             <fctr> AFG, AFG, AFG, AFG...
## $ Latitude                                   <dbl> 33.0000, 33.0000, 3...
## $ Longtitude                                 <dbl> 65.0, 65.0, 65.0, 6...

Merging column lonitude and Latitude together for a better coordinate to be in maps (googlevis)

df2$Lat_Long = paste(df2$Latitude, df2$Longtitude, sep=":")

kable(head(df2))
Country_Name Year_Code Country_Code.x Adults_15_living_HIV Adults_Children_0_14_15_living_HIV AIDS_estimated_deaths_UNAIDS Adults_children_0_14_15_newly_infected_HIV Adults_15_newly_infected_HIV Children_0_14_living_with_HIV Children_orphaned_by_HIV_AIDS Children_0_14_newly_infected_HIV Incidence_tuberculosis_per_100000 Labor_force_total Mortality_traffic_injury_100K Population_female Population_male Population_total Malaria_cases_reported Suicide_mortality_per_100K Tuberculosis_death_per_100K Tuberculosis_case_detection Tuberculosis_treatment_success_NewCases Country_Code.y Latitude Longtitude Lat_Long
Afghanistan YR2015 AFG 6700 6900 500 1000 1000 500 4200 100 189 9825616 15.2 16346869 17389625 33736494 86895 5.5 37 58 NA AFG 33 65 33:65
Afghanistan YR2013 AFG 5600 5800 500 1000 1000 200 3700 100 189 8916157 NA 15398276 16333412 31731688 39263 NA 41 53 88 AFG 33 65 33:65
Afghanistan YR2011 AFG 4700 4900 500 1000 1000 200 3200 100 189 8050184 NA 14444001 15264598 29708599 77549 NA 42 51 88 AFG 33 65 33:65
Afghanistan YR2012 AFG 5200 5300 500 1000 1000 200 3500 100 189 8458402 NA 14912657 15784301 30696958 54840 NA 42 51 88 AFG 33 65 33:65
Afghanistan YR2014 AFG 6200 6400 500 1000 1000 500 4000 100 189 9397624 NA 15881092 16876928 32758020 145282 NA 41 53 87 AFG 33 65 33:65
Afghanistan YR2010 AFG 4300 4500 500 1000 1000 200 3000 100 189 7707349 15.3 14005473 14797694 28803167 69397 5.1 41 53 86 AFG 33 65 33:65
missmap(df2, legend = TRUE, col = c("wheat","darkred", col=c('yellow', 'darkgreen')), main ="Plot Showing The Missing Values Per Observation",
        y.cex = 0.8, x.cex = 0.8,csvar = NULL, tsvar =NULL, rank.order = TRUE)

mice_missing_val <- mice(df2, m=1, method='cart', printFlag=FALSE, where = is.na(df2))
mice_missing_val_comp <- complete(mice_missing_val, action='long', include=FALSE)
df2_cleaned <- na.omit(mice_missing_val_comp)
df2_cleaned <- df2_cleaned[, -c(1:2)]
kable(head(df2_cleaned))
Country_Name Year_Code Country_Code.x Adults_15_living_HIV Adults_Children_0_14_15_living_HIV AIDS_estimated_deaths_UNAIDS Adults_children_0_14_15_newly_infected_HIV Adults_15_newly_infected_HIV Children_0_14_living_with_HIV Children_orphaned_by_HIV_AIDS Children_0_14_newly_infected_HIV Incidence_tuberculosis_per_100000 Labor_force_total Mortality_traffic_injury_100K Population_female Population_male Population_total Malaria_cases_reported Suicide_mortality_per_100K Tuberculosis_death_per_100K Tuberculosis_case_detection Tuberculosis_treatment_success_NewCases Country_Code.y Latitude Longtitude Lat_Long
Afghanistan YR2015 AFG 6700 6900 500 1000 1000 500 4200 100 189 9825616 15.2 16346869 17389625 33736494 86895 5.5 37 58 87 AFG 33 65 33:65
Afghanistan YR2013 AFG 5600 5800 500 1000 1000 200 3700 100 189 8916157 15.3 15398276 16333412 31731688 39263 5.2 41 53 88 AFG 33 65 33:65
Afghanistan YR2011 AFG 4700 4900 500 1000 1000 200 3200 100 189 8050184 26.3 14444001 15264598 29708599 77549 5.9 42 51 88 AFG 33 65 33:65
Afghanistan YR2012 AFG 5200 5300 500 1000 1000 200 3500 100 189 8458402 23.7 14912657 15784301 30696958 54840 5.2 42 51 88 AFG 33 65 33:65
Afghanistan YR2014 AFG 6200 6400 500 1000 1000 500 4000 100 189 9397624 28.4 15881092 16876928 32758020 145282 5.1 41 53 87 AFG 33 65 33:65
Afghanistan YR2010 AFG 4300 4500 500 1000 1000 200 3000 100 189 7707349 15.3 14005473 14797694 28803167 69397 5.1 41 53 86 AFG 33 65 33:65
na_count_after_cleaning <-sapply(df2_cleaned, function(c) sum(is.na(c)))
kable(na_count_after_cleaning)
Country_Name 0
Year_Code 0
Country_Code.x 0
Adults_15_living_HIV 0
Adults_Children_0_14_15_living_HIV 0
AIDS_estimated_deaths_UNAIDS 0
Adults_children_0_14_15_newly_infected_HIV 0
Adults_15_newly_infected_HIV 0
Children_0_14_living_with_HIV 0
Children_orphaned_by_HIV_AIDS 0
Children_0_14_newly_infected_HIV 0
Incidence_tuberculosis_per_100000 0
Labor_force_total 0
Mortality_traffic_injury_100K 0
Population_female 0
Population_male 0
Population_total 0
Malaria_cases_reported 0
Suicide_mortality_per_100K 0
Tuberculosis_death_per_100K 0
Tuberculosis_case_detection 0
Tuberculosis_treatment_success_NewCases 0
Country_Code.y 0
Latitude 0
Longtitude 0
Lat_Long 0
kable(summary(df2_cleaned[, 4:22]))
Adults_15_living_HIV Adults_Children_0_14_15_living_HIV AIDS_estimated_deaths_UNAIDS Adults_children_0_14_15_newly_infected_HIV Adults_15_newly_infected_HIV Children_0_14_living_with_HIV Children_orphaned_by_HIV_AIDS Children_0_14_newly_infected_HIV Incidence_tuberculosis_per_100000 Labor_force_total Mortality_traffic_injury_100K Population_female Population_male Population_total Malaria_cases_reported Suicide_mortality_per_100K Tuberculosis_death_per_100K Tuberculosis_case_detection Tuberculosis_treatment_success_NewCases
Min. : 100 Min. : 100 Min. : 100 Min. : 100 Min. : 100 Min. : 100 Min. : 500 Min. : 100 Min. : 0.0 Min. :1.389e+05 Min. : 2.40 Min. :1.454e+05 Min. :1.341e+05 Min. :2.796e+05 Min. : 0 Min. : 0.000 Min. : 0.11 Min. : 15.00 Min. : 0.00
1st Qu.: 8300 1st Qu.: 8700 1st Qu.: 275 1st Qu.: 1000 1st Qu.: 1000 1st Qu.: 500 1st Qu.: 3500 1st Qu.: 100 1st Qu.: 28.0 1st Qu.:1.918e+06 1st Qu.:13.65 1st Qu.:2.199e+06 1st Qu.:2.194e+06 1st Qu.:4.350e+06 1st Qu.: 411 1st Qu.: 5.825 1st Qu.: 2.00 1st Qu.: 58.00 1st Qu.: 75.00
Median : 37000 Median : 40500 Median : 1500 Median : 2400 Median : 2000 Median : 1750 Median : 18500 Median : 500 Median : 100.0 Median :5.089e+06 Median :20.00 Median :5.661e+06 Median :5.620e+06 Median :1.131e+07 Median : 17549 Median : 8.400 Median : 9.30 Median : 76.00 Median : 83.00
Mean : 514638 Mean : 555964 Mean : 20816 Mean : 32101 Mean : 28040 Mean : 43017 Mean : 318681 Mean : 4333 Mean : 165.4 Mean :4.363e+07 Mean :19.90 Mean :4.838e+07 Mean :4.923e+07 Mean :9.763e+07 Mean : 485136 Mean :10.275 Mean :19.00 Mean : 70.25 Mean : 79.09
3rd Qu.: 170000 3rd Qu.: 180000 3rd Qu.: 6200 3rd Qu.: 9950 3rd Qu.: 9225 3rd Qu.: 12750 3rd Qu.: 90000 3rd Qu.: 1000 3rd Qu.: 235.0 3rd Qu.:1.233e+07 3rd Qu.:27.00 3rd Qu.:1.476e+07 3rd Qu.:1.574e+07 3rd Qu.:3.030e+07 3rd Qu.: 277235 3rd Qu.:12.575 3rd Qu.:30.00 3rd Qu.: 87.00 3rd Qu.: 87.00
Max. :33800000 Max. :36100000 Max. :1500000 Max. :2200000 Max. :1900000 Max. :2600000 Max. :19700000 Max. :300000 Max. :1246.0 Max. :3.406e+09 Max. :45.40 Max. :3.644e+09 Max. :3.708e+09 Max. :7.355e+09 Max. :11627473 Max. :35.300 Max. :99.00 Max. :110.00 Max. :100.00

Average Kids Orphaned By HIV/AIDS

All_Cen <- sqldf("select Country_Name, Year_Code, (Children_orphaned_by_HIV_AIDS/Population_total)*100 as 'Average_Orphaned_byHIV' from df2_cleaned where Average_Orphaned_byHIV >= 2 order by Average_Orphaned_byHIV desc limit 50" )
# Boxplot
g <- ggplot(All_Cen, aes(Year_Code, Average_Orphaned_byHIV)) + geom_boxplot(varwidth=T, fill="plum") + 
    labs(title="Box plot", 
         subtitle="Average_Orphaned_byHIV grouped by Year",
         caption="Source: The World Bank Data Bank",
         x="Year",
         y="Avg. Children Orphaned By HIV/AIDS")

ggplotly(g)
theme_set(theme_bw())

# Draw plot
p1 <- ggplot(All_Cen, aes(x=Country_Name, y=Average_Orphaned_byHIV)) + 
  geom_bar(stat="identity", width=.5, fill="tomato3") + 
  labs(title="Top 50 Countries Where Children Orphaned By HIV/AIDS Per Pop (>= 2)", 
       subtitle="Country Vs Avg. Orphaned By HIV/AIDS ", 
       caption="source: The World Bank Data Bank") + 
  theme(axis.text.x = element_text(angle=65, vjust=0.6))

ggplotly(p1)
# Histogram on a Continuous (Numeric) Variable
g <- ggplot(All_Cen, aes(Average_Orphaned_byHIV)) + scale_fill_brewer(palette = "Spectral") + geom_histogram(aes(fill=Year_Code), 
                   bins=20, 
                   col="black", 
                   size=.05) +   # change number of bins
  labs(title="Top 50 Countries Where Children Orphaned By HIV/AIDS Per Pop (>= 2)", 
       subtitle="Average Children Orpaned By HIV/AIDS")


ggplotly(g)
ggplotly(ggplot(All_Cen, aes(x=Average_Orphaned_byHIV, y=Country_Name, fill=Year_Code)) +
geom_point(colour="black", size=.2, alpha=.4) +
scale_fill_brewer(palette="Blues", breaks=rev(levels(All_Cen$Year_Code))))
p <- ggplot(data =All_Cen, aes(x = Country_Name, y =Average_Orphaned_byHIV ))+ geom_boxplot(aes(color = Year_Code)) + facet_wrap(~Year_Code) + ggtitle("Top 50 Countries Where Children Orphaned By HIV/AIDS Per Pop (>= 2)") +
xlab("Countries") + ylab("Average_Orphaned_byHIV")
ggplotly(p)

INCIDENCE RATES

“Incidence rate or person-time rate is a measure of incidence that incorporates time directly into the denominator. A person-time rate is generally calculated from a long-term cohort follow-up study, wherein enrollees are followed over time and the occurrence of new cases of disease is documented.”" —CDC

\(Incidence\quad Rate\quad =\quad \frac { Number\quad of\quad newcases\quad of\quad disease\quad or\quad injury\quad during\quad specified\quad period }{ Time\quad each\quad person\quad was\quad observed,\quad totaled\quad for\quad all\quad persons }\)

Incident_Hiv_All_Ages <- sqldf("select Population_total, Lat_Long,Country_Name, Year_Code, (Adults_children_0_14_15_newly_infected_HIV/Population_total)*100 as 'HIV_Incident_Rate' from df2_cleaned where HIV_Incident_Rate >= 0.5 order by HIV_Incident_Rate desc limit 30" )

kable(head(Incident_Hiv_All_Ages))
Population_total Lat_Long Country_Name Year_Code HIV_Incident_Rate
2117361 -29.5:28.5 Lesotho YR2013 1.086258
1202843 -26.5:31.5 Swaziland YR2010 1.080773
2040551 -29.5:28.5 Lesotho YR2010 1.078140
2089928 -29.5:28.5 Lesotho YR2012 1.052668
2145785 -29.5:28.5 Lesotho YR2014 1.025266
2064166 -29.5:28.5 Lesotho YR2011 1.017360
Incident_Tuber_All_Ages <- sqldf("select Population_total, Lat_Long,Country_Name, Year_Code, (Tuberculosis_case_detection/Population_total)*100 as 'Tuberculocisis_Incident_Rate' from df2_cleaned where Tuberculocisis_Incident_Rate >= 0.01 order by Tuberculocisis_Incident_Rate, Year_Code desc limit 30" )
kable(head(Incident_Tuber_All_Ages)) 
Population_total Lat_Long Country_Name Year_Code Tuberculocisis_Incident_Rate
768514 5:-59 Guyana YR2015 0.0104097
763393 5:-59 Guyana YR2014 0.0104795
758081 5:-59 Guyana YR2013 0.0105530
753091 5:-59 Guyana YR2012 0.0106229
749100 5:-59 Guyana YR2011 0.0106795
746556 5:-59 Guyana YR2010 0.0107159
Hiv_Inc_Plot <- ggplot(data =Incident_Hiv_All_Ages, aes(y = Country_Name, x =HIV_Incident_Rate ))+ geom_boxplot(aes(color = Year_Code)) + facet_wrap(~Year_Code) + ggtitle("Top 30  Countries With HIV/AIDS Incident Rate (>= 0.5)") +
xlab("Countries") + ylab("HIV Incident Rates")
ggplotly(Hiv_Inc_Plot)
Hiv_Inc_Map <- gvisGeoChart(Incident_Hiv_All_Ages, locationvar ="Lat_Long",  hovervar ="Country_Name",sizevar = "HIV_Incident_Rate", colorvar = "Population_total",
                   options=list(displayMode="Markers", 
                   colorAxis="{colors:['purple', 'red', 'orange', 'grey', 'pink']}",
                   backgroundColor="lightblue"), chartid="EQ")
plot(Hiv_Inc_Map )
## starting httpd help server ... done
ggplotly(ggplot(Incident_Tuber_All_Ages, aes(x=Tuberculocisis_Incident_Rate, y=Country_Name, fill=Year_Code)) +
geom_point(colour="black", size=.2, alpha=.4) +
scale_fill_brewer(palette="Blues", breaks=rev(levels(All_Cen$Year_Code))))
p4 <- ggplot(data =Incident_Tuber_All_Ages, aes(y = Country_Name, x =Tuberculocisis_Incident_Rate ))+ geom_boxplot(aes(color = Year_Code)) + facet_wrap(~Year_Code) + ggtitle("Top Countries with High Tuberculosis Incidence Rate") +
ylab("Countries") + xlab("Year")
ggplotly(p4)
Tub_Inc_Map <- gvisGeoChart(Incident_Tuber_All_Ages, locationvar ="Lat_Long",  hovervar ="Country_Name",sizevar = "Tuberculocisis_Incident_Rate", colorvar = "Population_total",
                   options=list(displayMode="Markers", 
                   colorAxis="{colors:['purple', 'red', 'orange', 'grey', 'pink']}",
                   backgroundColor="lightblue"), chartid="ER")
plot(Tub_Inc_Map )
EQ
Data: Incident_Hiv_All_Ages • Chart ID: EQgoogleVis-0.6.2
R version 3.4.2 (2017-09-28) • Google Terms of UseDocumentation and Data Policy